mgcViz is an extension of the mgcv R package, which extends the basic visualizations provided by mgcv using ggplot2.

Modelling the GEFCom14 electricity demand data

  1. Load data and have a look at it
library(testGam)
data("gefcom_small")
head(gefcom_small)
##   Year NetDemand       wM       Posan        Trend Dow    wM_s95
## 1 2005  2.889407 16.38889 0.001370019 0.0000000000   6  8.236849
## 2 2005  2.543342 17.36111 0.004110058 0.0003961965   0  9.933947
## 3 2005  2.446159 18.88889 0.006850097 0.0007923930   1 10.794862
## 4 2005  2.190166 21.25000 0.009590136 0.0011885895   2 14.462260
## 5 2005  2.100094 20.83333 0.012330175 0.0015847861   3 16.194613
## 6 2005  2.130908 20.83333 0.015070213 0.0019809826   4 16.341327
##   NetDemand.24
## 1     2.889407
## 2     2.889407
## 3     2.543342
## 4     2.446159
## 5     2.190166
## 6     2.100094
plot(gefcom_small$NetDemand)

  1. Fit a basic Gaussian GAM
library(mgcv)
fit1 <- gam(formula = NetDemand ~ NetDemand.24 + Dow + Trend + 
                      s(wM) + s(wM_s95) + s(Posan, bs = "cc"), 
            data = gefcom_small)

Visualise the seasonal effect

plot(fit1, select = 3, scale = FALSE)

This calls plot.gam

args(plot.gam)
## function (x, residuals = FALSE, rug = NULL, se = TRUE, pages = 0, 
##     select = NULL, scale = -1, n = 100, n2 = 40, n3 = 3, pers = FALSE, 
##     theta = 30, phi = 30, jit = FALSE, xlab = NULL, ylab = NULL, 
##     main = NULL, ylim = NULL, xlim = NULL, too.far = 0.1, all.terms = FALSE, 
##     shade = FALSE, shade.col = "gray80", shift = 0, trans = I, 
##     seWithMean = FALSE, unconditional = FALSE, by.resids = FALSE, 
##     scheme = 0, ...) 
## NULL

Limitations of plot.gam:

mgcViz solves these problems using a layered system based on ggplot2. To use it, we need to object of class gamViz

library(mgcViz)
fit1_v <- getViz(fit1)

class(fit1)
## [1] "gam" "glm" "lm"
class(fit1_v)
## [1] "gamViz" "gam"    "glm"    "lm"

We can extract individual effects using

e1 <- sm(fit1_v, 3)
class(e1)
## [1] "cyclic.smooth"  "mgcv.smooth.1D"

plot effect using effect-specific methods

pl1 <- plot(e1) # calls plot.mgcv.smooth.1D()

then add layers and render

pl1 <- pl1 + l_fitLine() + l_ciLine()
pl1 # calls print.plotSmooth

In one step

plot(sm(fit1_v, 3)) + l_fitLine() + l_ciLine()

There is a variety of layers available

pl1 <- plot(sm(fit1_v, 3)) + l_ciPoly(level = 0.99) + l_fitLine(color = "red") + l_rug() 
pl2 <- plot(sm(fit1_v, 3), nsim = 20) + l_simLine()

gridPrint(pl1, pl2, ncol = 2)

To get list of layers do

listLayers( plot(sm(fit1_v, 3)) )
## [1] "l_ciLine"  "l_ciPoly"  "l_dens2D"  "l_fitDens" "l_fitLine" "l_points" 
## [7] "l_rug"     "l_simLine"

You can also use ggplot2 layers

plot(sm(fit1_v, 3)) + l_fitLine() + l_ciLine() + xlim(c(0.25, 0.75))

Layers from mgcViz always start with the l_ prefix.

For routine work you can use default plots, in mgcv

plot(fit1, select = 1, scale = FALSE) # mgcv

in mgcViz

plot(fit1_v, select = 1) # mgcViz

Plotting all terms in mgcv

plot(fit1, all.terms = TRUE, pages = 1)

in mgcViz

print(plot(fit1_v, allTerms = TRUE), pages = 1)

So workflow is

fit1 <- gam(formula = NetDemand ~ NetDemand.24 + Dow + Trend + 
                      s(wM) + s(wM_s95) + s(Posan, bs = "cc"), 
            data = gefcom_small)

fit1_v <- getViz( fit1 )

Alternatively, using shortcut:

fit1_v <- gamV(NetDemand ~ NetDemand.24 + Dow + Trend + 
                           s(wM) + s(wM_s95) + s(Posan, bs = "cc"), 
               data = gefcom_small)

In general:

Notice that gamV has fewer arguments

args(gam)
## function (formula, family = gaussian(), data = list(), weights = NULL, 
##     subset = NULL, na.action, offset = NULL, method = "GCV.Cp", 
##     optimizer = c("outer", "newton"), control = list(), scale = 0, 
##     select = FALSE, knots = NULL, sp = NULL, min.sp = NULL, H = NULL, 
##     gamma = 1, fit = TRUE, paraPen = NULL, G = NULL, in.out = NULL, 
##     drop.unused.levels = TRUE, drop.intercept = NULL, ...) 
## NULL
args(gamV)
## function (formula, family = gaussian(), data = list(), method = "REML", 
##     aGam = list(), aViz = list()) 
## NULL

So to do

b <- gam(formula = f, data = d, 
         weights = w, select = TRUE, gamma = 1.5, knots = kn, sp = ms)

b <- getViz(b, 
            nsim = 50, post = TRUE)

we need to do

b <- gamV(formula = f, data = d, 
          aGam = list(weights = w, select = TRUE, gamma = 1.5, knots = kn, sp = ms), 
          aViz = list(nsim = 50, post = TRUE))

So, going back to our example

fit1_v <- gamV(NetDemand ~ NetDemand.24 + Dow + Trend + 
                           s(wM) + s(wM_s95) + s(Posan, bs = "cc"), 
               data = gefcom_small, 
               aViz = list(nsim = 50))

We check the residuals as a function of time (trend):

plot(gefcom_small$Trend, residuals(fit1_v), ylim = c(-1, 1))

Using mgcViz

check1D(fit1_v, x = "Trend") + l_gridCheck1D()

We need a smooth effect for trend. Same checks for several effects

pl <- check1D(fit1_v, x = list("Trend", "wM", "Posan", "Dow")) + l_gridCheck1D()
print(pl, pages = 1)

Looking at basis dimension:

par(mfrow = c(2, 2))
gam.check(fit1_v)

## 
## Method: REML   Optimizer: outer newton
## full convergence after 7 iterations.
## Gradient range [-0.0002371614,0.0002097874]
## (score -397.8729 & scale 0.04002065).
## Hessian positive definite, eigenvalue range [3.292137,1255.534].
## Model rank =  35 / 35 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##             k'  edf k-index p-value    
## s(wM)     9.00 8.60    0.91  <2e-16 ***
## s(wM_s95) 9.00 8.13    1.02    0.80    
## s(Posan)  8.00 7.66    1.04    0.96    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Same with mgcViz

check(fit1_v) # calls check.gamViz
## 
## Method: REML   Optimizer: outer newton
## full convergence after 7 iterations.
## Gradient range [-0.0002371614,0.0002097874]
## (score -397.8729 & scale 0.04002065).
## Hessian positive definite, eigenvalue range [3.292137,1255.534].
## Model rank =  35 / 35 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##             k'  edf k-index p-value    
## s(wM)     9.00 8.60    0.91  <2e-16 ***
## s(wM_s95) 9.00 8.13    1.02    0.76    
## s(Posan)  8.00 7.66    1.04    0.97    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

edf is quite close to k' for all effects, and we get a very low p-value for wM. We have to increase k a bit.

fit2_v <- gamV(NetDemand ~ NetDemand.24 + Dow + s(Trend, k = 6) + s(wM, k = 20) + 
                           s(wM_s95, k = 15) + s(Posan, k = 15, bs = "cc"), 
             data = gefcom_small,
             aViz = list(nsim = 50))

Re-check residuals

pl <- check1D(fit2_v, x = list("Trend", "NetDemand.24", "wM", "Posan")) + l_gridCheck1D()
print(pl, pages = 1)

Look at QQ-plots:

qq(fit2_v, rep = 100, CI = "quantile") # calls qq.gamViz

We change to a Student-t distribution

fit3_v <- gamV(NetDemand ~ NetDemand.24 + Dow + s(Trend, k = 6) + s(wM, k = 20) + 
               s(wM_s95, k = 15) + s(Posan, k = 15, bs = "cc"), data = gefcom_small, 
               family = scat,
               aViz = list(nsim = 50))

AIC(fit2_v, fit3_v)
##              df       AIC
## fit2_v 49.98509 -1187.255
## fit3_v 52.62095 -1469.572
qq(fit3_v, rep = 100, CI = "quantile")

Larynx cancer in Germany

Load the data

library(testGam)
library(mgcv)
data("Larynx")

head(Larynx)
##   region         E  Y  x
## 1      0  7.965008  8 56
## 2      1 22.836219 22 65
## 3      2 22.094716 19 50
## 4      3  7.919352  3 63
## 5      4 13.139889 14 65
## 6      5 15.898848  9 51

Load geometry of regions

data("german.polys")
str(german.polys[1:5])
## List of 5
##  $ 0: num [1:10, 1:2] 2535 2554 2577 2577 2592 ...
##  $ 1: num [1:20, 1:2] 2988 2955 2934 2967 2924 ...
##  $ 2: num [1:30, 1:2] 3490 3498 3537 3498 3476 ...
##  $ 3: num [1:10, 1:2] 2927 2871 2871 2876 2859 ...
##  $ 4: num [1:46, 1:2] 2363 2377 2426 2440 2516 ...

Calculate regions’ centres and add them to data set

X <- t( sapply(german.polys, colMeans, na.rm=TRUE) )
Larynx$lo <- X[ , 1]
Larynx$la <- X[ , 2]

Now fit model including only the effect of smoking

fit1 <- gamV(Y ~ s(x, k=20),
             family = poisson, 
             data=Larynx, 
             aViz = list(nsim = 100))

Check mean residuals, binned across space

check2D(fit1, x1 = Larynx$lo, x2 = Larynx$la) + l_gridCheck2D()

The massive positive outlier is Berlin! We forgot the offset (i.e. number of cancers proportional to population of region)

fit2 <- gamV(Y ~ s(x, k=20) + offset( log(E) ),
             family = poisson, 
             data=Larynx, 
             aViz = list(nsim = 100))

Repeat the check

check2D(fit2, x1 = Larynx$lo, x2 = Larynx$la) + l_gridCheck2D()

There is still some spatial effect. We can look at it also by-region

check1D(fit2, x = Larynx$region) + l_gridCheck1D(showReps = FALSE, level = 0.95)

Add MRF effect to capture spatial effect

fit3 <- gamV(Y ~ s(region, k = 200, bs="mrf", xt=list(polys=german.polys)) + 
                 s(x, k=20) + offset( log(E) ),
             family = poisson, 
             data=Larynx, 
             aViz = list(nsim = 100))

Repeat the checks and compare

library(viridis)
ck1 <- check2D(fit2, x1 = Larynx$lo, x2 = Larynx$la) + l_gridCheck2D() + ggtitle("Before") + 
       scale_fill_gradientn(colours = viridis(50, begin = 0.2), limits = c(-5, 5))

ck2 <- check2D(fit3, x1 = Larynx$lo, x2 = Larynx$la) + l_gridCheck2D() + ggtitle("After")  + 
       scale_fill_gradientn(colours = viridis(50, begin = 0.2), limits = c(-5, 5))

gridPrint(ck1, ck2, ncol = 2)

Looking at the mean residuals by region

ck1 <- check1D(fit2, x = Larynx$region) + l_gridCheck1D(showReps = FALSE, level = 0.95) + ylim(c(-5, 5)) 
ck2 <- check1D(fit3, x = Larynx$region) + l_gridCheck1D(showReps = FALSE, level = 0.95) + ylim(c(-5, 5))
gridPrint(ck1, ck2, ncol = 2)

Looks better! We look at the spatial effect

plot(fit3, select = 1)

Now we use an isotropic smooth instead

fitISO <- gamV(Y ~ s(lo, la, k = 200) + 
                 s(x, k=20) + offset( log(E) ),
             family = poisson, 
             data=Larynx)

Visualise the effect

plot(fitISO, select = 1)

Visualise in 3D using rgl package

plotRGL( sm(fitISO, 1), residuals = TRUE )

NB: plotRGL is not layered, it is a classic function

args(plotRGL.mgcv.smooth.2D)
## function (x, se = TRUE, n = 40, residuals = FALSE, type = "auto", 
##     maxpo = 1000, too.far = 0, xlab = NULL, ylab = NULL, main = NULL, 
##     xlim = NULL, ylim = NULL, se.mult = 1, trans = identity, 
##     seWithMean = FALSE, unconditional = FALSE, ...) 
## NULL

–> –> –>

–> –>